Notes: We use a scatterplot to visualize the relationship between two continuous variables.
library(ggplot2)
# Load the tab-separated pseudo-Facebook data set into a data frame.
pf <- read.delim("pseudo_facebook.tsv")
# Quick scatterplot of friend count against age.
qplot(age, friend_count, data = pf)
Response: Younger users have a lot of friends.
Notes: ggplot lets us specify more complicated plots.
# Scatterplot of friend count against age, with the x-axis limited to 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Numeric summary (min, quartiles, mean, max) of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes: Overplotting makes it difficult to tell how many points are in each region, so we can set the transparency of the points using the alpha parameter of the point geom (here geom_jitter).
# Jitter the points and draw each one at 1/20 opacity to reduce overplotting.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5189 rows containing missing values (geom_point).
Response: The friend counts for young users aren’t nearly as high as they looked before; the bulk of young users really have friend counts below 1000.
Notes: with this plot, it’s much easier to see the distribution of friend count.
# Semi-transparent points jittered horizontally only: vertical jitter is
# disabled (height = 0) so the y values reach the square-root transform
# unchanged.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 1 / 20
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5187 rows containing missing values (geom_point).
Notes: we use alpha to reduce overplotting
# Friendships initiated against age: 1/20 opacity, horizontal-only jitter,
# and a square-root-transformed y-axis.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 1 / 20
  ) +
  coord_trans(y = "sqrt")
Notes: Created another data frame that contains the mean and the median of friend count for each age.
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-age summary of friend counts: mean, median and number of users.
# Summary columns are named with `=`, so the count column is called `n`.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
# Show the first 20 ages.
head(pf.fc_by_age, 20)
## # A tibble: 20 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 165. 74 484
## 2 14 251. 132 1925
## 3 15 348. 161 2618
## 4 16 352. 172. 3086
## 5 17 350. 156 3283
## 6 18 331. 162 5196
## 7 19 334. 157 4391
## 8 20 283. 135 3769
## 9 21 236. 121 3671
## 10 22 211. 106 3032
## 11 23 203. 93 4404
## 12 24 186. 92 2827
## 13 25 131. 62 3641
## 14 26 144. 75 2815
## 15 27 134. 72 2240
## 16 28 126. 66 2364
## 17 29 121. 66 1936
## 18 30 115. 67.5 1716
## 19 31 118. 63 1694
## 20 32 114. 63 1443
Create your plot!
# Line plot of the mean friend count at each age.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes: displaying multiple summaries at the same time on the plot.
# Raw points (orange, 5% opacity, horizontal jitter only) on a square-root
# y-axis, overlaid with per-age summary lines: the mean in the default colour
# and the 10th, 50th and 90th percentiles in blue (10th and 90th dashed).
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`;
# it is left unchanged here, presumably matching the ggplot2 version that
# produced the recorded output -- confirm before upgrading.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    color = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    color = "blue", linetype = 2
  )
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5162 rows containing missing values (geom_point).
Response: Having more than 1000 friends is rare; 90% of users have fewer than 1000 friends.
Notes: We used the Pearson product moment correlation to measure the linear relationship between age and friend count.
# Pearson correlation test between age and friend count across all users.
cor.test(pf$age, pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027
Notes: We don’t want to include the older ages in our correlation number, since older ages are likely to be incorrect.
# Same correlation test, restricted to users aged 70 or younger.
pf %>%
  subset(age <= 70) %>%
  with(cor.test(age, friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes: Correlation Methods: Pearson’s r, Spearman’s rho, and Kendall’s tau.
Notes: We will look at the number of likes users received from friends on the desktop version of the site and compare it to the total number of likes users received.
# Raw scatterplot of desktop (www) likes received against total likes received.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()
# Same plot with both axes capped at their 95th percentile, plus a red
# linear-model fit line.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
# Correlation test (cor.test's default method) between desktop likes received
# and total likes received.
cor.test(pf$www_likes_received, pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
library(alr3)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set (presumably provided by the alr3 package
# attached just above -- confirm).
data(Mitchell)
Create your plot!
# Scatterplot of temperature against month, followed by their correlation test.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
cor.test(Mitchell$Month, Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes: We should place the x-axis breaks every 12 months, since there are 12 months in a year.
# Same scatterplot with an x-axis break every 12 months (0, 12, ..., 192).
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))
What do you notice? Response: When stretching out the graph, we notice more of a cyclical pattern, because there are seasons in Nebraska.
# Mean friend count by age in whole years.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()

# Age in years plus a fractional part derived from the birth month.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

# Mean, median and number of users for each distinct age_with_months value.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.2 46.3 30.5 6
## 2 13.2 115. 23.5 14
## 3 13.3 136. 44 25
## 4 13.4 164. 72 33
## 5 13.5 131. 66 45
## 6 13.6 157. 64 54
# Mean friend count by month-resolution age, for ages below 71.
ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p1: yearly means with a smoother, ages below 71.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: month-resolution means with a smoother, ages below 71.
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to the nearest multiple of 5.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`.
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the three plots in a single column: p2 on top, then p1, then p3.
grid.arrange(grobs = list(p2, p1, p3), ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Notes: In exploratory data analysis we often create multiple visualizations and summaries of the same data, gleaning different insights from each.
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!